{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "Loading round1_test_dataset requires you to execute the dataset script in that repo on your local machine. Make sure you have read the code there to avoid malicious use, then set the option `trust_remote_code=True` to remove this error.",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[3], line 4\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mdatasets\u001b[39;00m\n\u001b[1;32m      3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_dataset\n\u001b[0;32m----> 4\u001b[0m dataset \u001b[38;5;241m=\u001b[39m \u001b[43mload_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mround1_test_dataset.py\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43msplit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtrain\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m      5\u001b[0m dataset[\u001b[38;5;241m0\u001b[39m:\u001b[38;5;241m40\u001b[39m]\n\u001b[1;32m      6\u001b[0m \u001b[38;5;66;03m# sdataset = dataset.train_test_split(test_size=0.1)\u001b[39;00m\n\u001b[1;32m      7\u001b[0m \u001b[38;5;66;03m# train_dataset = sdataset[\"train\"]\u001b[39;00m\n\u001b[1;32m      8\u001b[0m \u001b[38;5;66;03m# val_dataset = sdataset[\"test\"]\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     24\u001b[0m \u001b[38;5;66;03m# train_dataset.to_json(\"train_dataset.json\",mode=\"w\")\u001b[39;00m\n\u001b[1;32m     25\u001b[0m \u001b[38;5;66;03m# val_dataset.to_json(\"val_dataset.json\",mode=\"w\")\u001b[39;00m\n",
      "File \u001b[0;32m~/miniconda3/envs/torch24/lib/python3.11/site-packages/datasets/load.py:2132\u001b[0m, in \u001b[0;36mload_dataset\u001b[0;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, keep_in_memory, save_infos, revision, token, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)\u001b[0m\n\u001b[1;32m   2127\u001b[0m verification_mode \u001b[38;5;241m=\u001b[39m VerificationMode(\n\u001b[1;32m   2128\u001b[0m     (verification_mode \u001b[38;5;129;01mor\u001b[39;00m VerificationMode\u001b[38;5;241m.\u001b[39mBASIC_CHECKS) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m save_infos \u001b[38;5;28;01melse\u001b[39;00m VerificationMode\u001b[38;5;241m.\u001b[39mALL_CHECKS\n\u001b[1;32m   2129\u001b[0m )\n\u001b[1;32m   2131\u001b[0m \u001b[38;5;66;03m# Create a dataset builder\u001b[39;00m\n\u001b[0;32m-> 2132\u001b[0m builder_instance \u001b[38;5;241m=\u001b[39m \u001b[43mload_dataset_builder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   2133\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2134\u001b[0m \u001b[43m    \u001b[49m\u001b[43mname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2135\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2136\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2137\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2138\u001b[0m \u001b[43m    \u001b[49m\u001b[43mfeatures\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfeatures\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2139\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2140\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2141\u001b[0m \u001b[43m    \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2142\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2143\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2144\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtrust_remote_code\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrust_remote_code\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2145\u001b[0m \u001b[43m    \u001b[49m\u001b[43m_require_default_config_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m   2146\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mconfig_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2147\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2149\u001b[0m \u001b[38;5;66;03m# Return iterable dataset in case of streaming\u001b[39;00m\n\u001b[1;32m   2150\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m streaming:\n",
      "File \u001b[0;32m~/miniconda3/envs/torch24/lib/python3.11/site-packages/datasets/load.py:1853\u001b[0m, in \u001b[0;36mload_dataset_builder\u001b[0;34m(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, token, storage_options, trust_remote_code, _require_default_config_name, **config_kwargs)\u001b[0m\n\u001b[1;32m   1851\u001b[0m     download_config \u001b[38;5;241m=\u001b[39m download_config\u001b[38;5;241m.\u001b[39mcopy() \u001b[38;5;28;01mif\u001b[39;00m download_config \u001b[38;5;28;01melse\u001b[39;00m DownloadConfig()\n\u001b[1;32m   1852\u001b[0m     download_config\u001b[38;5;241m.\u001b[39mstorage_options\u001b[38;5;241m.\u001b[39mupdate(storage_options)\n\u001b[0;32m-> 1853\u001b[0m dataset_module \u001b[38;5;241m=\u001b[39m \u001b[43mdataset_module_factory\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1854\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1855\u001b[0m \u001b[43m    \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1856\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1857\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1858\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1859\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1860\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1861\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtrust_remote_code\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrust_remote_code\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1862\u001b[0m \u001b[43m    \u001b[49m\u001b[43m_require_default_config_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m_require_default_config_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1863\u001b[0m \u001b[43m    \u001b[49m\u001b[43m_require_custom_configs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mbool\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mconfig_kwargs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1864\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1865\u001b[0m \u001b[38;5;66;03m# Get dataset builder class from the processing script\u001b[39;00m\n\u001b[1;32m   1866\u001b[0m builder_kwargs \u001b[38;5;241m=\u001b[39m dataset_module\u001b[38;5;241m.\u001b[39mbuilder_kwargs\n",
      "File \u001b[0;32m~/miniconda3/envs/torch24/lib/python3.11/site-packages/datasets/load.py:1571\u001b[0m, in \u001b[0;36mdataset_module_factory\u001b[0;34m(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, cache_dir, trust_remote_code, _require_default_config_name, _require_custom_configs, **download_kwargs)\u001b[0m\n\u001b[1;32m   1564\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m path\u001b[38;5;241m.\u001b[39mendswith(filename):\n\u001b[1;32m   1565\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(path):\n\u001b[1;32m   1566\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mLocalDatasetModuleFactoryWithScript\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1567\u001b[0m \u001b[43m            \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1568\u001b[0m \u001b[43m            \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1569\u001b[0m \u001b[43m            \u001b[49m\u001b[43mdynamic_modules_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdynamic_modules_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1570\u001b[0m \u001b[43m            \u001b[49m\u001b[43mtrust_remote_code\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrust_remote_code\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m-> 1571\u001b[0m \u001b[43m        \u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_module\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1572\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   1573\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCouldn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt find a dataset script at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrelative_to_absolute_path(path)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
      "File \u001b[0;32m~/miniconda3/envs/torch24/lib/python3.11/site-packages/datasets/load.py:774\u001b[0m, in \u001b[0;36mLocalDatasetModuleFactoryWithScript.get_module\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    763\u001b[0m         _create_importable_file(\n\u001b[1;32m    764\u001b[0m             local_path\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpath,\n\u001b[1;32m    765\u001b[0m             local_imports\u001b[38;5;241m=\u001b[39mlocal_imports,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    771\u001b[0m             download_mode\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdownload_mode,\n\u001b[1;32m    772\u001b[0m         )\n\u001b[1;32m    773\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 774\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m    775\u001b[0m             \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLoading \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m requires you to execute the dataset script in that\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    776\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m repo on your local machine. Make sure you have read the code there to avoid malicious use, then\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    777\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m set the option `trust_remote_code=True` to remove this error.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    778\u001b[0m         )\n\u001b[1;32m    779\u001b[0m _check_library_imports(name\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mname, library_imports\u001b[38;5;241m=\u001b[39mlibrary_imports)\n\u001b[1;32m    780\u001b[0m module_path, \u001b[38;5;28mhash\u001b[39m \u001b[38;5;241m=\u001b[39m _load_importable_file(\n\u001b[1;32m    781\u001b[0m     dynamic_modules_path\u001b[38;5;241m=\u001b[39mdynamic_modules_path,\n\u001b[1;32m    782\u001b[0m     module_namespace\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdatasets\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m    783\u001b[0m     subdirectory_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mhash\u001b[39m,\n\u001b[1;32m    784\u001b[0m     name\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mname,\n\u001b[1;32m    785\u001b[0m )\n",
      "\u001b[0;31mValueError\u001b[0m: Loading round1_test_dataset requires you to execute the dataset script in that repo on your local machine. Make sure you have read the code there to avoid malicious use, then set the option `trust_remote_code=True` to remove this error."
     ]
    }
   ],
   "source": [
    "\n",
    "import json\n",
    "import datasets\n",
    "from datasets import load_dataset\n",
    "dataset = load_dataset(\"round1_test_dataset.py\",split=\"train\",trust_remote_code=True)\n",
    "dataset[0:40]\n",
    "# sdataset = dataset.train_test_split(test_size=0.1)\n",
    "# train_dataset = sdataset[\"train\"]\n",
    "# val_dataset = sdataset[\"test\"]\n",
    "  \n",
    "# # 将数据集以 JSON Lines 格式保存到文件，保持中文字符原样  \n",
    "# with open('train_dataset.json', 'w', encoding='utf-8') as f:  \n",
    "#     for example in train_dataset:  \n",
    "#         json_str = json.dumps(example, ensure_ascii=False)  # 注意这里添加了 ensure_ascii=False  \n",
    "#         f.write(json_str + '\\n')\n",
    "\n",
    "# with open('val_dataset.json', 'w', encoding='utf-8') as f:  \n",
    "#     for example in val_dataset:  \n",
    "#         json_str = json.dumps(example, ensure_ascii=False)  # 注意这里添加了 ensure_ascii=False  \n",
    "#         f.write(json_str + '\\n')\n",
    "\n",
    "# print(len(train_dataset))\n",
    "# print(len(val_dataset))\n",
    "\n",
    "# train_dataset.to_json(\"train_dataset.json\",mode=\"w\")\n",
    "# val_dataset.to_json(\"val_dataset.json\",mode=\"w\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torch24",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
